import matplotlib.pyplot as plt
from IPython.core import display
Can we scrape HN? https://news.ycombinator.com/item?id=1721105
# Output files: the scraped posts and the term-document matrix derived from them.
posts_savefile = 'posts.csv'
tdm_savefile = 'posts_tdm.csv'
# (year, month, url) for each monthly "Who is hiring?" HN thread to scrape.
urls = (
(2011, 1, 'https://news.ycombinator.com/item?id=2057704'),
(2011, 2, 'https://news.ycombinator.com/item?id=2161360'),
(2011, 3, 'https://news.ycombinator.com/item?id=2270790'),
(2011, 4, 'https://news.ycombinator.com/item?id=2396027'),
(2011, 5, 'https://news.ycombinator.com/item?id=2503204'),
(2011, 6, 'https://news.ycombinator.com/item?id=2607052'),
(2011, 7, 'https://news.ycombinator.com/item?id=2719028'),
(2011, 8, 'https://news.ycombinator.com/item?id=2831646'),
(2011, 9, 'https://news.ycombinator.com/item?id=2949787'),
(2011, 10, 'https://news.ycombinator.com/item?id=3060221'),
(2011, 11, 'https://news.ycombinator.com/item?id=3181796'),
(2011, 12, 'https://news.ycombinator.com/item?id=3300290'),
(2012, 1, 'https://news.ycombinator.com/item?id=3412900'),
(2012, 2, 'https://news.ycombinator.com/item?id=3537881'),
(2012, 3, 'https://news.ycombinator.com/item?id=3652041'),
(2012, 4, 'https://news.ycombinator.com/item?id=3783657'),
(2012, 5, 'https://news.ycombinator.com/item?id=3913997'),
(2012, 6, 'https://news.ycombinator.com/item?id=4053076'),
(2012, 7, 'https://news.ycombinator.com/item?id=4184755'),
(2012, 8, 'https://news.ycombinator.com/item?id=4323597'),
(2012, 9, 'https://news.ycombinator.com/item?id=4463689'),
(2012, 10, 'https://news.ycombinator.com/item?id=4596375'),
(2012, 11, 'https://news.ycombinator.com/item?id=4727241'),
(2012, 12, 'https://news.ycombinator.com/item?id=4857714'),
(2013, 1, 'https://news.ycombinator.com/item?id=4992617'),
(2013, 2, 'https://news.ycombinator.com/item?id=5150834'),
(2013, 3, 'https://news.ycombinator.com/item?id=5304169'),
(2013, 4, 'https://news.ycombinator.com/item?id=5472746'),
(2013, 5, 'https://news.ycombinator.com/item?id=5637663'),
(2013, 6, 'https://news.ycombinator.com/item?id=5803764'),
(2013, 7, 'https://news.ycombinator.com/item?id=5970187'),
(2013, 8, 'https://news.ycombinator.com/item?id=6139927'),
(2013, 9, 'https://news.ycombinator.com/item?id=6310234'),
(2013, 10, 'https://news.ycombinator.com/item?id=6475879'),
(2013, 11, 'https://news.ycombinator.com/item?id=6653437'),
(2013, 12, 'https://news.ycombinator.com/item?id=6827554'),
(2014, 1, 'https://news.ycombinator.com/item?id=6995020'),
(2014, 2, 'https://news.ycombinator.com/item?id=7162197'),
(2014, 3, 'https://news.ycombinator.com/item?id=7324236'),
(2014, 4, 'https://news.ycombinator.com/item?id=7507765'),
(2014, 5, 'https://news.ycombinator.com/item?id=7679431')
)
def filename(year, month):
    """Return the local cache path for one month's saved thread HTML."""
    return 'html/hn_{0}_{1}.html'.format(year, month)
# keep the thread URLs in a DataFrame too, handy for saving to CSV later
import pandas as pd

urlsdf = pd.DataFrame([list(row) for row in urls], columns=['year', 'month', 'url'])
urlsdf.head(3)
year | month | url | |
---|---|---|---|
0 | 2011 | 1 | https://news.ycombinator.com/item?id=2057704 |
1 | 2011 | 2 | https://news.ycombinator.com/item?id=2161360 |
2 | 2011 | 3 | https://news.ycombinator.com/item?id=2270790 |
from bs4 import BeautifulSoup
import collections
import os.path
import requests
import time
# Fetch every month's thread (including its paginated 'More' pages) into a
# local HTML file. Failed months are re-queued and retried up to 3x total.
stack = collections.deque(urls)
tries = len(stack) * 3  # maximum attempts 3 times of number of URLs
while tries > 0:
    tries -= 1
    current = stack.pop()
    year, month, url = current
    # local html output file
    fname = filename(year, month)
    # start fresh: remove any partial output from a previous failed attempt
    if os.path.isfile(fname):
        os.remove(fname)
    try:
        # get the HN pages for month / year; ym_pages acts as a worklist of
        # pages still to fetch for this one thread
        ym_pages = [url]
        while ym_pages:
            url = ym_pages.pop()
            print "Fetching URL: %s" % (url)
            r = requests.get(url)
            # fail if bad error code
            if r.status_code != requests.codes.ok:
                raise Exception('Error from server: ' + str(r.status_code))
            text = r.text.replace('&', '_')  # broken HTML escapes breaking BeautifulSoup, removing
            # append mode: one month spans several concatenated HTML documents
            with open(fname, 'a') as htmlfile:
                htmlfile.write(text.encode('utf-8'))
            # check for 'More' link to the next page of this thread
            soup = BeautifulSoup(text)
            links = soup.find_all('a', text='More')
            if links:
                # sometimes forward slash is being html escaped and messed
                # up by above & replacement, need to replace again
                link_url = 'https://news.ycombinator.com' + links[0]['href'].replace('_#x2F;', '/')
                ym_pages.append(link_url)
            # take a break for 30 seconds between requests (rate limiting)
            time.sleep(30)
    except Exception as e:
        print 'error:', e, 'currently on:', current
        # stick current URL at the beginning of the queue for a later retry
        stack.appendleft(current)
    # get out when stack is empty
    if not stack: break
Parse the saved Hacker News thread HTML into {user, post} dicts. Can we scrape HN? https://news.ycombinator.com/item?id=1721105
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import re
def html_to_posts(html):
    """Parse an html document into posts.

    Returns a list of {'user': ..., 'post': ...} dicts, one per top-level
    comment, skipping [deleted]/[dead] comments.
    """
    posts = []
    html = html.replace("<br>", "<br/>")  # unclosed <br>'s are messing up BeautifulSoup
    soup = BeautifulSoup(html)
    # navigate HN's table-based layout down to the comments table
    # NOTE(review): assumes the old (pre-redesign) HN markup -- verify against the saved pages
    tables = soup.body.center.table('tr', recursive=False)[2].td('table', recursive=False)
    comments_table = tables[1] if len(tables) > 1 else tables[0]
    rows = comments_table('tr', recursive=False)
    for row in rows:
        # check if td and table exist
        if not row.td or not row.td.table: continue
        # top-level comments have a zero-width spacer image (no reply indent)
        spacer_img = row.td.table.tr.td.img
        if not spacer_img['width'] == '0': continue
        comment_tag = row.find_all('span', class_='comment')[0]
        comment = comment_tag.get_text(separator=' ')
        #print comment[:30]
        if comment == '[deleted]' or comment == '[dead]': continue
        head_tag = row('span', class_='comhead')[0]
        user = head_tag.a.text
        posts.append({'user': user, 'post': comment.encode('utf-8')})
    return posts
# urls = [(2012, 7, 'https://news.ycombinator.com/item?id=4184755')]
import datetime

# Parse every saved month file into per-post dicts tagged with the thread date.
posts = []
for year, month, url in urls:
    # each saved file holds several concatenated HTML documents (one per
    # paginated page), so split on the closing </html> tag
    with open(filename(year, month)) as htmlfile:
        all_html = htmlfile.read()
    start_html = 0
    while start_html < len(all_html):
        end_html = all_html.find('</html>', start_html)
        if end_html == -1:
            # no closing tag: consume the remainder (prevents an infinite
            # loop on a truncated file, where find() keeps returning -1)
            end_html = len(all_html)
        else:
            end_html += len('</html>')
        html = all_html[start_html:end_html]
        start_html = end_html
        for post in html_to_posts(html):
            # datetime.datetime instead of pd.datetime (removed in pandas 2.0)
            post['date'] = datetime.datetime(year, month, 1)
            posts.append(post)
postsdf = pd.DataFrame(posts, columns=['date', 'user', 'post'])
postsdf.to_csv(posts_savefile, index=False)
# round-trip through CSV so later cells see exactly what a reload produces
postsdf = pd.read_csv(posts_savefile, parse_dates=[0])
postsdf.head(3)
date | user | post | |
---|---|---|---|
0 | 2011-01-01 00:00:00 | lkrubner | In New York City there are a lot of jobs. I we... |
1 | 2011-01-01 00:00:00 | jasonfried | 37signals is hiring two Rails programmers:\n h... |
2 | 2011-01-01 00:00:00 | tptacek | Chicago (or remote) Matasano Security LEAD SOF... |
postsdf.tail(3)
date | user | post | |
---|---|---|---|
9938 | 2014-05-01 00:00:00 | jasonlotito | MeetMe - New Hope, PA (near Philadelphia, Penn... |
9939 | 2014-05-01 00:00:00 | ssharpe67 | Datalex - Atlanta, GA\nReady to use your tech ... |
9940 | 2014-05-01 00:00:00 | findwork | Disclaimer: Forgive me for posting here. I jus... |
# add year and month columns to the posts DataFrame
postsdf['year'] = [d.year for d in postsdf.date]
postsdf['month'] = [d.month for d in postsdf.date]
# count posts per monthly thread date
ymdf = pd.DataFrame({'count': postsdf.groupby(['date']).size()}).reset_index()
ymdf['year'] = [d.year for d in ymdf.date]
ymdf['month'] = [d.month for d in ymdf.date]
# display a year x month table of post counts
ymdf[['year', 'month', 'count']].pivot(index='year', columns='month', values='count')
month | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 |
---|---|---|---|---|---|---|---|---|---|---|---|---|
year | ||||||||||||
2011 | 88 | 150 | 27 | 218 | 217 | 257 | 224 | 230 | 191 | 198 | 230 | 203 |
2012 | 149 | 201 | 251 | 201 | 231 | 227 | 194 | 245 | 214 | 248 | 221 | 230 |
2013 | 192 | 219 | 291 | 343 | 323 | 263 | 292 | 309 | 239 | 426 | 298 | 263 |
2014 | 223 | 330 | 340 | 356 | 389 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
# get unique years in the DataFrame
years = postsdf['year'].unique()
# wide figure: overall trend on the left, per-year overlay on the right
fig = plt.figure(figsize=(15, 3))
# plot all the data
ax = fig.add_subplot(121)
ymdf[['date', 'count']].set_index('date').plot(ax=ax)
ax.legend(loc=4)
ax.set_title("Number of Posts Each Month Since January 2011")
# plot data split out by year
ax = fig.add_subplot(122)
# keyword arguments: positional index/columns for pivot were removed in pandas 2.0
df = ymdf[['count', 'year', 'month']].pivot(index='month', columns='year')
# display(df)
df.plot(ax=ax)
ax.legend(loc=4)
ax.set_title("Split Out Per Year")
plt.show()
# weekday (0=Monday .. 6=Sunday) each monthly thread was posted on
postsdf['weekday'] = [d.weekday() for d in postsdf['date']]
# restrict to March 2011 onwards (earlier threads have unrepresentative volume);
# pd.Timestamp replaces the original datetime.date, which was never imported
# (NameError) and compares cleanly against the datetime64 'date' column
mar_2011 = pd.Timestamp(2011, 3, 1)
after_mar_2011 = postsdf[postsdf['date'] > mar_2011]
posts_date_day = after_mar_2011[['date', 'weekday']]
grouped = posts_date_day.groupby(['date', 'weekday'])
# posts per (date, weekday) pair; each date has exactly one weekday, so this
# is simply the per-thread post count tagged with its weekday
thread_counts = pd.DataFrame(grouped.size()).reset_index()
# mean posts per thread, for each weekday the thread was posted on
mean_by_weekday = thread_counts.groupby('weekday').mean()
#mean_by_weekday
# load the newline-delimited stopword list
with open('stopwords') as stopfile:
    stopwords = [line.strip() for line in stopfile]
# reload the posts from disk so this section can run standalone
postsdf = pd.read_csv(posts_savefile, parse_dates=[0])#[:2500]
postsdf.tail(3)
date | user | post | |
---|---|---|---|
9938 | 2014-05-01 00:00:00 | jasonlotito | MeetMe - New Hope, PA (near Philadelphia, Penn... |
9939 | 2014-05-01 00:00:00 | ssharpe67 | Datalex - Atlanta, GA\nReady to use your tech ... |
9940 | 2014-05-01 00:00:00 | findwork | Disclaimer: Forgive me for posting here. I jus... |
import re
postsdf2 = postsdf.drop('user', axis=1)
def merge(v):
    """Join an iterable of post strings into one space-separated string."""
    joined = ' '.join(v)
    return joined
def words_in_post(post, stop_words=None):
    """Tokenize *post* and count occurrences of each non-stopword term.

    Punctuation and digit characters are replaced by spaces, the text is
    lower-cased and split on whitespace, and stopwords are dropped.

    post: a string (typically all of one month's posts concatenated).
    stop_words: optional iterable of words to exclude; defaults to the
        module-level ``stopwords`` list for backward compatibility.
    Returns a dict-like (Counter) mapping of term -> occurrence count.
    """
    if stop_words is None:
        stop_words = stopwords
    # set membership is O(1) per word vs O(n) scanning a list every time
    stop_set = set(stop_words)
    # NOTE: inside a character class '|' is a literal, so '|' characters are
    # also treated as separators; pattern kept as-is to preserve tokenization
    post = re.sub(r'[\. |, |\-|/|\(|\)|;|\[|\]|:|!|"|?|=|_|0-9]', ' ', post)
    words = [word for word in post.lower().split() if word not in stop_set]
    # Counter replaces the hand-rolled dict.get counting loop
    return collections.Counter(words)
# one row per thread date: post count plus all post text concatenated together
grouped = postsdf2.groupby(['date'])
alltextdf = pd.DataFrame({
    'post_count': grouped.size(),
    'alltext': grouped['post'].apply(merge),
})
alltextdf = alltextdf.reset_index()
# loop over month/years and build the term-document matrix:
# one row per (thread date, term)
tdm_cols = ['date', 'year', 'month', 'term', 'count', 'post_count', 'word_count']
frames = []
for row in alltextdf.itertuples(index=False):
    words = words_in_post(row.alltext)
    date = row.date
    word_count = len(words)  # number of distinct terms that month
    frames.append(pd.DataFrame(
        [(date, date.year, date.month, term, count, row.post_count, word_count)
         for term, count in words.items()],
        columns=tdm_cols))
# concatenate once at the end: repeated pd.concat inside the loop is O(n^2);
# this also replaces the `type(tdm_df) != pd.DataFrame` first-iteration check
tdm_df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame(columns=tdm_cols)
# term frequency normalised by the number of posts that month
tdm_df['prop'] = 1.0 * tdm_df['count'] / tdm_df['post_count']
tdm_df.to_csv(tdm_savefile, index=False)
# round-trip through CSV so later cells can start from the saved file
tdm_df = pd.read_csv(tdm_savefile, parse_dates=[0])
tdm_df.head(2)
#display(tdm_df.tail(2))
date | year | month | term | count | post_count | word_count | prop | |
---|---|---|---|---|---|---|---|---|
0 | 2011-01-01 00:00:00 | 2011 | 1 | secondly | 1 | 88 | 1745 | 0.011364 |
1 | 2011-01-01 00:00:00 | 2011 | 1 | sbnation | 2 | 88 | 1745 | 0.022727 |
import itertools
# cycle line styles so overlapping series stay distinguishable
linecycler = itertools.cycle(['-', '--', ':'])
fig = plt.figure(figsize=(15, 8))
ax = fig.add_subplot(111)
terms = sorted(['java', 'php', 'python', 'rails', 'django', 'hadoop', 'ember', 'angularjs', 'meteor', 'javascript'])
# restrict the term-document matrix to just the tracked terms
df = tdm_df[tdm_df.term.isin(terms)][['date', 'term', 'prop']]
# one line per term, plotting its proportion over time
for term in terms:
    term_df = df[df['term'] == term][['date', 'prop']].set_index(['date'])
    ax.plot(term_df.index, term_df.values, linestyle=next(linecycler), label=term, linewidth=9)
plt.legend(loc=2)
plt.show()